#!/usr/bin/env Rscript
"> methepic_emseq_wgbs_per-pos_beta.R <
Reads in betas from MethylationEPIC arrays, EM-seq and WGBS, and compares them
on a per-position basis.
For short read tech (EM-seq/WGBS), coverage cutoffs are defined as having
at least 3 of 4 samples from the same method having a coverage of 5 and
above (i.e., EM-seq 3 of 4 with coverage >= 5 && WGBS 3 of 4 with coverage >= 5).
MethylationEPIC readouts are 'analogue' so no 'coverage' to deal with.
" -> doc
setwd('~/csiro/stopwatch/cpgberus/14_methepic_vs_emseq_wgbs/')
# colour codes are from Dark2 panel
EMSEQ_COLOR <- '#1b9e77'
EPIC_COLOR <- '#e7298a'
WGBS_COLOR <- '#7570b3'
suppressPackageStartupMessages({
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
})
# function to pretty-print diagnostic messages
diag_message <- function(...) {
message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}
# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
# modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
model <- lm(as.formula(paste(y, '~', x)), df)
eq <- substitute(
italic(y) == c + m %.% italic(x)*','~italic(r)^2 == r2*','~italic(p) == pval*','~italic(n) == n_df,
list(c = format(unname(coef(model)[1]), digits=3),
m = format(unname(coef(model)[2]), digits=3),
r2 = format(summary(model)$r.squared, digits=4),
pval = format(summary(model)$coefficients[2,4], digits=1),
n_df = format(nrow(df), big.mark=',')))
as.character(as.expression(eq))
}
# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
sum(int_vector) / length(int_vector) * 100
}
# load processed EPIC data
load('epic_betas_hg38.RData')
# fix EPIC GRanges--all of the current positions refer to the 'C' on the Watson
# strand. this is correct when strand is '+', but not correct when strand is
# '-' (methylated C on Crick would be the G on Watson)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
## seqnames ranges strand | WR025V1I WR025V9I WR069V1I
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric>
## cg18478105 chr20 63216298 + | 0.0173053 0.0199615 0.0217486
## cg09835024 chrX 24054523 + | 0.0326714 0.0517594 0.0346174
## cg14361672 chr9 128701657 - | 0.8865378 0.8594833 0.8773512
## cg01763666 chr17 82201630 - | 0.8258608 0.8342132 0.7833352
## cg12950382 chr14 104710399 - | 0.9221174 0.9384120 0.9030496
## cg02115394 chr13 114234693 - | 0.0870580 0.0858822 0.0855477
## cg25813447 chrX 38801258 + | 0.4203698 0.3365862 0.3448609
## cg07779434 chrX 14873227 + | 0.3743789 0.3703045 0.3686351
## cg13417420 chr12 12696225 + | 0.0308294 0.0230513 0.0306978
## cg12480843 chr8 73879050 + | 0.0257384 0.0236206 0.0288161
## WR069V9I
## <numeric>
## cg18478105 0.0168954
## cg09835024 0.0512708
## cg14361672 0.8487040
## cg01763666 0.8522306
## cg12950382 0.9232085
## cg02115394 0.0777868
## cg25813447 0.3797780
## cg07779434 0.3673453
## cg13417420 0.0264349
## cg12480843 0.0262686
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
## width seq names
## [1] 1 C cg18478105
## [2] 1 C cg09835024
## [3] 1 G cg14361672
## [4] 1 G cg01763666
## [5] 1 G cg12950382
## [6] 1 G cg02115394
## [7] 1 C cg25813447
## [8] 1 C cg07779434
## [9] 1 C cg13417420
## [10] 1 C cg12480843
ranges(epic_gr)[strand(epic_gr) == '-'] <- shift(ranges(epic_gr)[strand(epic_gr) == '-'], 1)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
## seqnames ranges strand | WR025V1I WR025V9I WR069V1I
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric>
## cg18478105 chr20 63216298 + | 0.0173053 0.0199615 0.0217486
## cg09835024 chrX 24054523 + | 0.0326714 0.0517594 0.0346174
## cg14361672 chr9 128701658 - | 0.8865378 0.8594833 0.8773512
## cg01763666 chr17 82201631 - | 0.8258608 0.8342132 0.7833352
## cg12950382 chr14 104710400 - | 0.9221174 0.9384120 0.9030496
## cg02115394 chr13 114234694 - | 0.0870580 0.0858822 0.0855477
## cg25813447 chrX 38801258 + | 0.4203698 0.3365862 0.3448609
## cg07779434 chrX 14873227 + | 0.3743789 0.3703045 0.3686351
## cg13417420 chr12 12696225 + | 0.0308294 0.0230513 0.0306978
## cg12480843 chr8 73879050 + | 0.0257384 0.0236206 0.0288161
## WR069V9I
## <numeric>
## cg18478105 0.0168954
## cg09835024 0.0512708
## cg14361672 0.8487040
## cg01763666 0.8522306
## cg12950382 0.9232085
## cg02115394 0.0777868
## cg25813447 0.3797780
## cg07779434 0.3673453
## cg13417420 0.0264349
## cg12480843 0.0262686
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
## width seq names
## [1] 1 C cg18478105
## [2] 1 C cg09835024
## [3] 1 C cg14361672
## [4] 1 C cg01763666
## [5] 1 C cg12950382
## [6] 1 C cg02115394
## [7] 1 C cg25813447
## [8] 1 C cg07779434
## [9] 1 C cg13417420
## [10] 1 C cg12480843
# change beta in EPIC GRanges into methylation levels (x 100%)
values(epic_gr) <- as.matrix(values(epic_gr)) * 100
epic_gr
## GRanges object with 864755 ranges and 4 metadata columns:
## seqnames ranges strand | WR025V1I WR025V9I WR069V1I
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric>
## cg18478105 chr20 63216298 + | 1.73053 1.99615 2.17486
## cg09835024 chrX 24054523 + | 3.26714 5.17594 3.46174
## cg14361672 chr9 128701658 - | 88.65378 85.94833 87.73512
## cg01763666 chr17 82201631 - | 82.58608 83.42132 78.33352
## cg12950382 chr14 104710400 - | 92.21174 93.84120 90.30496
## ... ... ... ... . ... ... ...
## cg23079522 chr3 160851840 + | 81.94656 85.42163 85.21876
## cg16818145 chr3 183064489 + | 81.73503 71.50334 78.85068
## cg14585103 chr8 138928365 + | 68.51512 62.61529 68.91321
## cg10633746 chr17 18261129 - | 8.23758 7.12615 8.98915
## cg12623625 chr1 17620429 - | 54.71393 53.53235 52.03187
## WR069V9I
## <numeric>
## cg18478105 1.68954
## cg09835024 5.12708
## cg14361672 84.87040
## cg01763666 85.22306
## cg12950382 92.32085
## ... ...
## cg23079522 83.80077
## cg16818145 72.00106
## cg14585103 67.36199
## cg10633746 8.68595
## cg12623625 54.87539
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
# load per-position data from EM-seq and WGBS
cov_df <- read.delim('rarefied.coverages.tsv.gz')
beta_df <- read.delim('rarefied.methpcts.tsv.gz')
# filtering `beta_df` for positions with min 5 coverage in ALL 8 rarefied
# samples was too stringent: initial union of all 55m positions (covered at
# least once in at least one sample) drops to < 1m post-filtering.
#
# requiring min 3 of 4 EM-seq samples && min 3 of 4 WGBS samples produces a
# more workable set of positions, ~7.7m.
#
# get numbers that summarises coverages across EM-seq and WGBS samples
diag_message('Number of positions with min 5 coverage in 4/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) == 4))
## [17:48:17] Number of positions with min 5 coverage in 4/4 EM-seq samples: 9568385
diag_message('Number of positions with min 5 coverage in 4/4 WGBS samples: ',
sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) == 4))
## [17:48:25] Number of positions with min 5 coverage in 4/4 WGBS samples: 3274673
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) >= 3))
## [17:48:27] Number of positions with min 5 coverage in 3/4 EM-seq samples: 27615442
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) >= 3))
## [17:48:30] Number of positions with min 5 coverage in 3/4 EM-seq samples: 12354217
# ... WGBS not performing too hot here, tsk tsk
# note: for these lines to work, positions in `beta_df` and `cov_df` must have
# the same order (guaranteed by upstream Python script). else `beta_df`
# will not be sliced properly
diag_message('Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5. ')
## [17:48:32] Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5.
diag_message('Original nrow: ', nrow(beta_df), '; ')
## [17:48:32] Original nrow: 57672689;
beta_df <- beta_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
cov_df <- cov_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
diag_message('filtered nrow: ', nrow(beta_df))
## [17:48:47] filtered nrow: 7744836
diag_message('Mean coverage of remaining positions are: ',
'EM-seq ', mean(as.matrix(cov_df[grepl('ER$', colnames(cov_df))])), '; ',
'WGBS ', mean(as.matrix(cov_df[grepl('WR$', colnames(cov_df))])))
## [17:48:47] Mean coverage of remaining positions are: EM-seq 7.59187689707051; WGBS 6.70286546416219
# again, WGBS not performing well in terms of coverage
# convert positions from `beta_df` into a GRanges object, so that intersecting
# EPIC data (already in GRanges object) is easy
beta_gr <- GRanges(paste(beta_df$scaffold, beta_df$start_pos, sep=':'))
mcols(beta_gr) <- beta_df[4:ncol(beta_df)]
head(beta_gr)
## GRanges object with 6 ranges and 8 metadata columns:
## seqnames ranges strand | WR025V1ER WR025V1WR WR025V9ER WR025V9WR
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <numeric>
## [1] chr1 16244 * | 80.000 77.778 85.714 60
## [2] chr1 17407 * | 88.889 100.000 100.000 100
## [3] chr1 17452 * | 94.118 100.000 100.000 100
## [4] chr1 17453 * | 90.000 100.000 100.000 100
## [5] chr1 17478 * | 100.000 100.000 100.000 100
## [6] chr1 17479 * | 92.857 100.000 92.308 100
## WR069V1ER WR069V1WR WR069V9ER WR069V9WR
## <numeric> <numeric> <numeric> <numeric>
## [1] 71.429 83.333 50.000 77.778
## [2] 100.000 100.000 100.000 83.333
## [3] 100.000 100.000 100.000 100.000
## [4] 100.000 100.000 100.000 100.000
## [5] 100.000 92.308 100.000 100.000
## [6] 87.500 100.000 85.714 100.000
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
# aggressively subset both GRanges to the intersection of both
diag_message('# positions in `epic_gr` before intersection: ', length(epic_gr))
## [17:49:14] # positions in `epic_gr` before intersection: 864755
epic_gr <- subsetByOverlaps(epic_gr, beta_gr, type='equal', ignore.strand=TRUE)
beta_gr <- subsetByOverlaps(beta_gr, epic_gr, type='equal', ignore.strand=TRUE)
epic_gr <- sort(sortSeqlevels(epic_gr), ignore.strand=TRUE)
beta_gr <- sort(sortSeqlevels(beta_gr), ignore.strand=TRUE)
diag_message('# positions in `epic_gr` after intersection: ', length(epic_gr))
## [17:49:15] # positions in `epic_gr` after intersection: 103670
# sanity check after subsetting: lengths are identical, sequence names are in
# same order, as well as start/end values
stopifnot(length(epic_gr) == length(beta_gr))
stopifnot(identical(seqnames(epic_gr), seqnames(beta_gr)))
stopifnot(identical(start(epic_gr), start(beta_gr)))
stopifnot(identical(end(epic_gr), end(beta_gr)))
# this allows for the cbinding of metadata together. store the metadata
# in `epic_gr`, which is better annotated than `beta_gr`
values(epic_gr) <- cbind(values(beta_gr), values(epic_gr))
# transform the metadata into a proper data.frame (not the 'DataFrame' S4 object
# used in GRanges), then calculate per-method mean methylation levels
wide_df <- as.data.frame(values(epic_gr))
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanI <- rowMeans(wide_df[, grepl('I$', colnames(wide_df))])
head(wide_df)
## WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620 100.000 85.714 100.000 80.000 100.000 100.000
## cg02288058 100.000 80.000 85.000 85.714 5.882 6.154
## cg11330075 71.429 100.000 80.000 70.000 83.333 60.000
## cg01068023 100.000 100.000 100.000 100.000 100.000 100.000
## cg05440980 100.000 100.000 66.667 100.000 90.000 50.000
## cg25838738 100.000 100.000 100.000 80.000 100.000 100.000
## WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I meanER
## cg24335620 92.308 100.000 77.68391 72.99938 77.39380 75.01038 98.07700
## cg02288058 3.774 2.174 37.77661 29.32215 36.55422 30.35048 48.66400
## cg11330075 40.000 100.000 16.61267 16.25537 18.78874 16.81365 68.69050
## cg01068023 100.000 100.000 83.65245 83.75699 83.47850 86.62742 100.00000
## cg05440980 100.000 100.000 90.88851 89.95479 89.70510 90.04642 89.16675
## cg25838738 100.000 100.000 87.04974 86.25925 86.39985 88.13814 100.00000
## meanWR meanI
## cg24335620 91.4285 75.77187
## cg02288058 43.5105 33.50087
## cg11330075 82.5000 17.11761
## cg01068023 100.0000 84.37884
## cg05440980 87.5000 90.14871
## cg25838738 95.0000 86.96174
# plot a PCA to visualise overall profile of datasets. note that PCA doesn't
# like dealing with NA, necessitating the use of complete.cases() to remove
# any rows with NA in them
set.seed(1337)
pca <- prcomp(t(wide_df[complete.cases(wide_df), ]), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2
pca_df <- data.frame(PC1=pca_coords$PC1,
PC2=pca_coords$PC2,
PC3=pca_coords$PC3,
row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*I$', 'EPIC', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|I$', '', rownames(pca_df))
g1 <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
geom_point(size=3, alpha=0.5) +
scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_shape_manual(values=21:25) +
xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
theme_minimal(12) +
theme(legend.position='top', legend.box='vertical', legend.margin=margin())
g2 <- ggplot(pca_df, aes(x=PC2, y=PC3, color=method, fill=method, shape=sample)) +
geom_point(size=3, alpha=0.5) +
scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_shape_manual(values=21:25) +
xlab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
ylab(paste0('PC3 (', round(eigs[3] / sum(eigs) * 100, 2), '%)')) +
theme_minimal(12) +
theme(legend.position='none', legend.box='vertical', legend.margin=margin())
plot_grid(g1, g2, ncol=1, align='v', rel_heights=c(0.55, 0.45))

ggsave('three-way.beta.pca.pdf', width=6, height=6)
# PCA indicates variation is largest across methods (specifically EPIC vs.
# the other two short read methods). There is some variation across samples,
# seen in PC3 (approx PC2 in terms of % variation explained). In PC3, the
# four-sided shapes are on top (WR025) while the three-sided ones are bottom
# (WR069). However, most variation still originate from choice of method,
# hence the calculation of per-method means; subsequent plots are based off
# these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position methylation levels, EM-seq vs. WGBS') +
xlab('EM-seq') +
ylab('WGBS') +
theme_minimal(12)

# plot EM-seq vs. EPIC
ggplot(wide_df, aes(x=meanER, y=meanI)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
ggtitle('Per-position methylation levels, EM-seq vs. EPIC') +
xlab('EM-seq') +
ylab('EPIC') +
theme_minimal(12)

# WGBS vs. EPIC
ggplot(wide_df, aes(x=meanWR, y=meanI)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
ggtitle('Per-position methylation levels, WGBS vs. EPIC') +
xlab('WGBS') +
ylab('EPIC') +
theme_minimal(12)

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
#
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101 # this value should be odd, so that the base at midpoint
# is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2
# letterFrequency(CG) / letterfrequency(ACGT) deals with edge cases where windows
# contain N or non-ACGT bases, which get ignored
window_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, epic_gr + bp_per_side)
wide_df$gcpct <- as.vector(
letterFrequency(window_seqs, 'CG') / letterFrequency(window_seqs, 'ACGT') * 100)
head(wide_df)
## WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620 100.000 85.714 100.000 80.000 100.000 100.000
## cg02288058 100.000 80.000 85.000 85.714 5.882 6.154
## cg11330075 71.429 100.000 80.000 70.000 83.333 60.000
## cg01068023 100.000 100.000 100.000 100.000 100.000 100.000
## cg05440980 100.000 100.000 66.667 100.000 90.000 50.000
## cg25838738 100.000 100.000 100.000 80.000 100.000 100.000
## WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I meanER
## cg24335620 92.308 100.000 77.68391 72.99938 77.39380 75.01038 98.07700
## cg02288058 3.774 2.174 37.77661 29.32215 36.55422 30.35048 48.66400
## cg11330075 40.000 100.000 16.61267 16.25537 18.78874 16.81365 68.69050
## cg01068023 100.000 100.000 83.65245 83.75699 83.47850 86.62742 100.00000
## cg05440980 100.000 100.000 90.88851 89.95479 89.70510 90.04642 89.16675
## cg25838738 100.000 100.000 87.04974 86.25925 86.39985 88.13814 100.00000
## meanWR meanI gcpct
## cg24335620 91.4285 75.77187 64.35644
## cg02288058 43.5105 33.50087 32.67327
## cg11330075 82.5000 17.11761 49.50495
## cg01068023 100.0000 84.37884 49.50495
## cg05440980 87.5000 90.14871 46.53465
## cg25838738 95.0000 86.96174 40.59406
# check distribution of GC% values, to select a colour scheme for the heatmap
# centered appropriately over the median
quantile(wide_df$gcpct, c(0.05, .1, .5, .9, .95))
## 5% 10% 50% 90% 95%
## 32.67327 34.65347 46.53465 60.39604 63.36634
# hmm. human genome GC% is 42%, but perhaps EPIC probes are preferentially
# chosen around CpG islands, bumping the GC% up slightly. choose a [30, 70]
# colour scale for subsequent plots
#
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position methylation levels') +
xlab('EM-seq (%)') +
ylab('WGBS (%)') +
theme_minimal(12) +
theme(legend.position=c(0.75, 0.1))
# plot EM-seq vs. EPIC and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanI, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('EM-seq (%)') +
ylab('EPIC (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# plot WGBS vs. EPIC and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanI, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('WGBS (%)') +
ylab('EPIC (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
##
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.561 -4.527 0.354 4.517 53.147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3372817 0.1436978 -2.347 0.0189 *
## wide_df$gcpct -0.0003291 0.0029985 -0.110 0.9126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.231 on 103668 degrees of freedom
## Multiple R-squared: 1.162e-07, Adjusted R-squared: -9.53e-06
## F-statistic: 0.01204 on 1 and 103668 DF, p-value: 0.9126
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
ggtitle('Per-position residual methylation levels vs. GC%') +
xlab('GC%') +
ylab('Residual WGBS - EM-seq (%)') +
theme_minimal(12)
wide_df$delta_ont_emseq <- wide_df$meanI - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual EPIC - EM-seq (%)') +
theme_minimal(12)
wide_df$delta_ont_wgbs <- wide_df$meanI - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual EPIC - WGBS (%)') +
theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
labels=c('A', 'B', 'C', 'D', 'E', 'F'))

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)
# hmm. there's some interesting trends in the high GC% region. how many positions
# have high GC% in its immediate context?
diag_message('Positions with GC > 70% in immediate context: ', sum(wide_df$gcpct > 70))
## [17:49:47] Positions with GC > 70% in immediate context: 1226
diag_message('Positions with GC > 75% in immediate context: ', sum(wide_df$gcpct > 75))
## [17:49:47] Positions with GC > 75% in immediate context: 235
# don't think analyses using these collection of points would be convincing...
# list deps used in this script
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 ggplot2_3.3.5
## [3] cowplot_1.1.1 BSgenome.Hsapiens.UCSC.hg38_1.4.3
## [5] BSgenome_1.60.0 rtracklayer_1.52.1
## [7] GenomicRanges_1.44.0 Biostrings_2.60.2
## [9] GenomeInfoDb_1.28.4 XVector_0.32.0
## [11] IRanges_2.26.0 S4Vectors_0.30.2
## [13] BiocGenerics_0.38.0
##
## loaded via a namespace (and not attached):
## [1] lattice_0.20-45 Rsamtools_2.8.0
## [3] assertthat_0.2.1 digest_0.6.29
## [5] utf8_1.2.2 R6_2.5.1
## [7] evaluate_0.14 highr_0.9
## [9] pillar_1.6.4 zlibbioc_1.38.0
## [11] rlang_0.4.12 jquerylib_0.1.4
## [13] Matrix_1.4-0 rmarkdown_2.11
## [15] splines_4.1.2 labeling_0.4.2
## [17] BiocParallel_1.26.2 stringr_1.4.0
## [19] RCurl_1.98-1.5 munsell_0.5.0
## [21] DelayedArray_0.18.0 compiler_4.1.2
## [23] xfun_0.29 pkgconfig_2.0.3
## [25] mgcv_1.8-38 htmltools_0.5.2
## [27] tidyselect_1.1.1 SummarizedExperiment_1.22.0
## [29] tibble_3.1.6 GenomeInfoDbData_1.2.6
## [31] matrixStats_0.61.0 XML_3.99-0.8
## [33] fansi_1.0.2 withr_2.4.3
## [35] crayon_1.4.2 dplyr_1.0.7
## [37] GenomicAlignments_1.28.0 bitops_1.0-7
## [39] grid_4.1.2 nlme_3.1-155
## [41] DBI_1.1.2 jsonlite_1.7.3
## [43] gtable_0.3.0 lifecycle_1.0.1
## [45] magrittr_2.0.1 scales_1.1.1
## [47] stringi_1.7.6 farver_2.1.0
## [49] bslib_0.3.1 ellipsis_0.3.2
## [51] vctrs_0.3.8 generics_0.1.1
## [53] rjson_0.2.21 restfulr_0.0.13
## [55] tools_4.1.2 Biobase_2.52.0
## [57] glue_1.6.0 purrr_0.3.4
## [59] MatrixGenerics_1.4.3 fastmap_1.1.0
## [61] yaml_2.2.1 colorspace_2.0-2
## [63] knitr_1.37 sass_0.4.0
## [65] BiocIO_1.2.0